targeted_normal_read_table <- readRDS("../Gaudin_thesis_data/normal_reads_table.rds")
SNP_targets_info <- read.table("../Gaudin_thesis_data/SNP_targets_info.csv", stringsAsFactors = F, header = FALSE, sep="\t")
SNP_targets_info <- SNP_targets_info[,c(4,5)]
targeted_normal_read_table[[1]]$chrom_and_position <- paste(targeted_normal_read_table[[1]]$Chromosome, ":",targeted_normal_read_table[[1]]$Position, "-",targeted_normal_read_table[[1]]$Position, sep="")
targeted_normal_read_table[[2]]$chrom_and_position <- paste(targeted_normal_read_table[[2]]$Chromosome, ":",targeted_normal_read_table[[2]]$Position, "-",targeted_normal_read_table[[2]]$Position, sep="")
#targeted_normal_read_table[[1]]$chrom_and_position
targeted_normal_read_table[[1]] <- merge(targeted_normal_read_table[[1]], SNP_targets_info, by.x = "chrom_and_position", by.y = "V5")
targeted_normal_read_table[[2]] <- merge(targeted_normal_read_table[[2]], SNP_targets_info, by.x = "chrom_and_position", by.y = "V5")
targeted_tumor_read_table <- readRDS("../Gaudin_thesis_data/tumor_reads_table.rds")
targeted_tumor_read_table[[1]]$chrom_and_position <- paste(targeted_tumor_read_table[[1]]$Chromosome, ":",targeted_tumor_read_table[[1]]$Position, "-",targeted_tumor_read_table[[1]]$Position, sep="")
targeted_tumor_read_table[[2]]$chrom_and_position <- paste(targeted_tumor_read_table[[2]]$Chromosome, ":",targeted_tumor_read_table[[2]]$Position, "-",targeted_tumor_read_table[[2]]$Position, sep="")
#targeted_tumor_read_table[[1]]$chrom_and_position
targeted_tumor_read_table[[1]] <- merge(targeted_tumor_read_table[[1]], SNP_targets_info, by.x = "chrom_and_position", by.y = "V5")
targeted_tumor_read_table[[2]] <- merge(targeted_tumor_read_table[[2]], SNP_targets_info, by.x = "chrom_and_position", by.y = "V5")
st_alt_table <- targeted_normal_read_table[[1]][,5:ncol(targeted_normal_read_table[[1]]) - 1] ## remove chromosome and position
st_cov_table <- targeted_normal_read_table[[2]][,5:ncol(targeted_normal_read_table[[2]]) - 1] ## remove chromosome and position
avg_alts <- c()
avg_covs <- c()
pool_vector <- c()
sds <- c()
het_count <- c()
for(i in 1:nrow(st_alt_table)){
indices <- which(st_alt_table[i,] > 5 & st_alt_table[i,] < 95 & st_cov_table[i,] > 250) ##only include SNP sites with VAF frequency between 25 - 75 and coverage > 100,
if(length(indices) > 7){ ##only average SNPs that are heterozygous at at least 7 SNP sites
avg_alts <- c(avg_alts, mean(as.numeric(st_alt_table[i,indices])))
avg_covs <- c(avg_covs, mean(as.numeric(st_cov_table[i,indices])))
pool_vector <- c(pool_vector, targeted_normal_read_table[[1]]$V4[i])
sds <- c(sds, sd(as.numeric(st_alt_table[i,indices])))
het_count <- c(het_count, length(indices))
}
}
cov_df <- data.frame(Average_vaf = avg_alts, Average_coverage = avg_covs, pool = pool_vector,Standard_devs = sds, het_count = het_count)
cov_df <- cov_df[!is.na(cov_df$Average_vaf),]
print("number of SNPs included")
## [1] "number of SNPs included"
print(nrow(cov_df))## print number of SNPs
## [1] 487
print(ggplot(cov_df, aes(x=Average_coverage, y=Average_vaf, color = pool)) + xlab("SNP Site Average Coverage") + ylab("Mean VAF for SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed") + ylim(5,95) + xlim(0,2500))
print(ggplot(cov_df, aes(x=Average_coverage, y=Standard_devs, shape = pool, color = het_count )) + xlab("SNP Site Average Coverage") + ylab("Standard Dev for het SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed") + xlim(0,2500))
minor_alleles <- c()
for(i in 1:nrow(cov_df)){
if(cov_df$Average_vaf[i] >= 50){
minor_alleles <- c(minor_alleles, 100 - cov_df$Average_vaf[i])
}else{
minor_alleles <- c(minor_alleles, cov_df$Average_vaf[i])
}
}
cov_df$minor_allels <- minor_alleles
print(ggplot(cov_df, aes(x=Average_coverage, y=minor_alleles, color = pool)) + xlab("SNP Site Average Coverage") + ylab("Mean VAF for minor allele SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed"))
st_alt_table <- targeted_tumor_read_table[[1]][,5:ncol(targeted_tumor_read_table[[1]]) - 1] ## remove chromosome and position and pool
st_cov_table <- targeted_tumor_read_table[[2]][,5:ncol(targeted_tumor_read_table[[2]]) - 1] ## remove chromosome and position and pool
avg_alts <- c()
avg_covs <- c()
pool_vector <- c()
sds <- c()
het_count <- c()
for(i in 1:nrow(st_alt_table)){
indices <- which(st_alt_table[i,] > 5 & st_alt_table[i,] < 95 & st_cov_table[i,] > 250) ##only include SNP sites with VAF frequency between 25 - 75 and coverage > 100,
if(length(indices) > 7){ ##only average SNPs that are heterozygous at at least 7 SNP sites
avg_alts <- c(avg_alts, mean(as.numeric(st_alt_table[i,indices])))
avg_covs <- c(avg_covs, mean(as.numeric(st_cov_table[i,indices])))
pool_vector <- c(pool_vector, targeted_tumor_read_table[[1]]$V4[i])
sds <- c(sds, sd(as.numeric(st_alt_table[i,indices])))
het_count <- c(het_count, length(indices))
}
}
cov_df <- data.frame(Average_vaf = avg_alts, Average_coverage = avg_covs, pool = pool_vector,Standard_devs = sds)
cov_df <- cov_df[!is.na(cov_df$Average_vaf),]
print("number of SNPs included")
## [1] "number of SNPs included"
print(nrow(cov_df))## print number of SNPs
## [1] 512
print(ggplot(cov_df, aes(x=Average_coverage, y=Average_vaf, color = pool)) + xlab("SNP Site Average Coverage") + ylab("Mean VAF for SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed") + ylim(5,95) + xlim(0,3500))
print(ggplot(cov_df, aes(x=Average_coverage, y=Standard_devs, shape = pool, color = het_count )) + xlab("SNP Site Average Coverage") + ylab("Standard Dev for het SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed")+ xlim(0,3500))
gene_table <- read.table("GENES_chrom_start_stop.txt", header=TRUE)
##adjust for targeting of genes for +/- 1MB , but dont overlap MSH2 and MSH6
gene_table$Start[-c(4)] <- gene_table$Start[-c(4)] - 1000000
gene_table$Stop[-c(3)] <- gene_table$Stop[-c(3)] + 1000000
print(gene_table)
## GENE Chromosome Start Stop
## 1 BRCA2 13 31889611 33973809
## 2 BRCA1 17 40196312 42277500
## 3 MSH2 2 46630108 47789450
## 4 MSH6 2 47922669 49037240
## 5 MLH1 3 36034823 38107380
## 6 PMS2 7 5012870 7048756
## 7 CDK4 12 57141510 59149796
## 8 MDM2 12 68201956 70239214
## 9 ERBB2 17 36844167 38886697
## 10 EGFR 7 54086714 56324313
## 11 MET 7 115312444 117438440
## 12 MYC 8 127747680 129753680
normal_targeted_het_counts <- gene_het_count_table(gene_table,targeted_normal_read_table,25,75,100)
for(i in 1:ncol(normal_targeted_het_counts)){
print(hist(normal_targeted_het_counts[,i], breaks = max(normal_targeted_het_counts[,i]), main=paste("Heterozygous SNPs for Each Sample (n = 47) Targeting", colnames(normal_targeted_het_counts)[i]), xlab="Number of Heterozygous SNPs", ylab="Number of Samples"))
}
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
##
## $counts
## [1] 10 1 2 1 4 3 2 3 2 2 8 3 4 4
##
## $density
## [1] 0.20408163 0.02040816 0.04081633 0.02040816 0.08163265 0.06122449
## [7] 0.04081633 0.06122449 0.04081633 0.04081633 0.16326531 0.06122449
## [13] 0.08163265 0.08163265
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
##
## $counts
## [1] 22 4 4 0 0 0 0 0 0 0 0 1 1 1 16
##
## $density
## [1] 0.44897959 0.08163265 0.08163265 0.00000000 0.00000000 0.00000000
## [7] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.02040816
## [13] 0.02040816 0.02040816 0.32653061
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
##
## $counts
## [1] 24 0 0 1 0 0 2 0 0 1 1 1 3 2 14
##
## $density
## [1] 0.48979592 0.00000000 0.00000000 0.02040816 0.00000000 0.00000000
## [7] 0.04081633 0.00000000 0.00000000 0.02040816 0.02040816 0.02040816
## [13] 0.06122449 0.04081633 0.28571429
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13
##
## $counts
## [1] 6 0 3 11 2 3 4 6 5 6 1 0 2
##
## $density
## [1] 0.12244898 0.00000000 0.06122449 0.22448980 0.04081633 0.06122449
## [7] 0.08163265 0.12244898 0.10204082 0.12244898 0.02040816 0.00000000
## [13] 0.04081633
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
##
## $counts
## [1] 23 4 0 0 1 0 0 0 0 1 0 1 0 1 1 10 7
##
## $density
## [1] 0.46938776 0.08163265 0.00000000 0.00000000 0.02040816 0.00000000
## [7] 0.00000000 0.00000000 0.00000000 0.02040816 0.00000000 0.02040816
## [13] 0.00000000 0.02040816 0.02040816 0.20408163 0.14285714
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13
##
## $counts
## [1] 17 0 2 0 3 6 8 7 0 1 1 1 3
##
## $density
## [1] 0.34693878 0.00000000 0.04081633 0.00000000 0.06122449 0.12244898
## [7] 0.16326531 0.14285714 0.00000000 0.02040816 0.02040816 0.02040816
## [13] 0.06122449
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
##
## $counts
## [1] 12 1 6 1 2 6 2 1 1 1 2 2 2 1 1 4 1 3
##
## $density
## [1] 0.24489796 0.02040816 0.12244898 0.02040816 0.04081633 0.12244898
## [7] 0.04081633 0.02040816 0.02040816 0.02040816 0.04081633 0.04081633
## [13] 0.04081633 0.02040816 0.02040816 0.08163265 0.02040816 0.06122449
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13
##
## $counts
## [1] 4 2 4 3 4 3 4 8 4 3 4 2 4
##
## $density
## [1] 0.08163265 0.04081633 0.08163265 0.06122449 0.08163265 0.06122449
## [7] 0.08163265 0.16326531 0.08163265 0.06122449 0.08163265 0.04081633
## [13] 0.08163265
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
##
## $counts
## [1] 20 7 1 2 0 2 0 2 1 2 0 0 0 0 3 0 5 0 4
##
## $density
## [1] 0.40816327 0.14285714 0.02040816 0.04081633 0.00000000 0.04081633
## [7] 0.00000000 0.04081633 0.02040816 0.04081633 0.00000000 0.00000000
## [13] 0.00000000 0.00000000 0.06122449 0.00000000 0.10204082 0.00000000
## [19] 0.08163265
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13
##
## $counts
## [1] 3 2 4 5 5 5 8 3 4 5 3 1 1
##
## $density
## [1] 0.06122449 0.04081633 0.08163265 0.10204082 0.10204082 0.10204082
## [7] 0.16326531 0.06122449 0.08163265 0.10204082 0.06122449 0.02040816
## [13] 0.02040816
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11
##
## $counts
## [1] 16 6 0 3 2 6 1 1 1 8 5
##
## $density
## [1] 0.32653061 0.12244898 0.00000000 0.06122449 0.04081633 0.12244898
## [7] 0.02040816 0.02040816 0.02040816 0.16326531 0.10204082
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9
##
## $counts
## [1] 10 5 3 2 2 14 8 2 3
##
## $density
## [1] 0.20408163 0.10204082 0.06122449 0.04081633 0.04081633 0.28571429
## [7] 0.16326531 0.04081633 0.06122449
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
#normal_targeted_het_counts
clin_samples <- read.table("../Gaudin_thesis_data/clin_samples_standard.txt")$V1 ##sample names for clinical files
clin_sample_paths <- paste("../Gaudin_thesis_data/standard/",clin_samples,".dat",sep="")
clin_sample_counts_merged <- merge_count_files(clin_sample_paths) ##merged SNP counts file
normal_samples <- read.table("../Gaudin_thesis_data/nova_standard_names.txt")$V1 ##sample names for normal files
normal_sample_paths <- paste("../Gaudin_thesis_data/nova_standard/",normal_samples,".dat",sep="")
normal_sample_counts_merged <- merge_count_files(normal_sample_paths) ##merged SNP counts file
saveRDS(normal_sample_counts_merged,"normal_table_merged.rds")
saveRDS(clin_sample_counts_merged,"clinical_table_merged.rds")
clin_samples <- read.table("../Gaudin_thesis_data/clin_samples_standard.txt")$V1 ##sample names for clinical files
normal_samples <- read.table("../Gaudin_thesis_data/nova_standard_names.txt")$V1 ##sample names for normal files
tumor_countsmerged <- readRDS("../Gaudin_thesis_data/clinical_table_merged.rds") ##load saved counts_merged file
normal_countsmerged <- readRDS("../Gaudin_thesis_data/normal_table_merged.rds") ##load saved counts_merged file
normal_and_clinical_sample_names <- c(as.character(normal_samples),as.character(clin_samples)) ##combine into single list
normal_tables <- format_het_table(normal_countsmerged,5,95) ##[[1]]: VAF frequency [[2]] coverage at SNP site
tumor_tables <- format_het_table(tumor_countsmerged,5,95) ##[[1]]: VAF frequency [[2]] coverage at SNP site
#saveRDS(tumor_tables, "../Gaudin_thesis_data/tumor_reads_table.rds")
#saveRDS(normal_tables, "../Gaudin_thesis_data/normal_reads_table.rds")
complete_vaf_df <- VAF_Z_Score_analysis(normal_tables, tumor_tables, 250, 5, 95, 5, 95, 7,normal_and_clinical_sample_names)
saveRDS(complete_vaf_df,"../Gaudin_thesis_data/cdf_250_5_95.rds")
complete_vaf_df <- readRDS("../Gaudin_thesis_data/cdf_250_5_95.rds")
#complete_vaf_df
gene_specific_z_scores <- gene_specific_tables(gene_table, tumor_tables, normal_tables, clin_samples, 5,95,5,95,7,250)
gene_z_df <- gene_specific_z_scores[[1]]
gene_counts_df <- gene_specific_z_scores[[2]]
#saveRDS(gene_z_df,"../Gaudin_thesis_data/clin_gene_z.rds")
#saveRDS(gene_counts_df,"../Gaudin_thesis_data/clin_gene_counts.rds")
sample_names <- read.table("../Gaudin_thesis_data/nova_clin_names.txt")$V1
sample_files <- read.table("../Gaudin_thesis_data/nova_clin_tlen_paths.txt")$V1
for(i in 1:length(sample_files)){
frag_lengths <- read.table(as.character(sample_files[i]))$V1
if(i == 1){
nt_frags_binsize_5 <- bin_fragments(frag_lengths,5,1,600,as.character(sample_names[i]))
}else{
nt_frags_binsize_5 <- merge(nt_frags_binsize_5,bin_fragments(frag_lengths,5,1,600,sample_names[i]), by=c('start','end'))
}
print(i)
}
saveRDS(nt_frags_binsize_5,"../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table <- readRDS("../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table$start <- paste("s",frag_table$start,"_",frag_table$end,sep="")
frag_table_starts_ends <- frag_table$start
frag_table <- frag_table[,-c(1,2)]
frag_table <- t(frag_table)
colnames(frag_table) <- frag_table_starts_ends
frag_table <- as.data.frame(frag_table)
frag_table <- frag_table[,-c(ncol(frag_table))]
frag_table$sample_names <- row.names(frag_table)
frag_table <- normalize_frag_table(frag_table)
fragment_bins_to_analyze <- readRDS("../Gaudin_thesis_data/fragment_bins_to_analyze.rds")
frag_table <- frag_table[,fragment_bins_to_analyze]
#frag_table <- frag_table[,c(c(26:31),c(35:40),c(51:75))] #bins between 125 - 155, 170 - 200, 250 - 375
frag_table_normal <- frag_table[c(1:47),]
frag_table_clin <- frag_table[c(48:nrow(frag_table)),]
frag_table_normal$sample_names = rownames(frag_table_normal)
frag_table_clin$sample_names = rownames(frag_table_clin)
frag_avg_z_scores <- calculate_frag_z_scores(frag_table_clin,frag_table_normal)
saveRDS(frag_avg_z_scores,"../Gaudin_thesis_data/clin_frag_z_scores_5.rds")
clin_mut_percent_table <- readRDS("../Gaudin_thesis_data/clin_ctDNA_est_table.rds")
clin_sample_key_table <- readRDS("../Gaudin_thesis_data/key_reference.rds") ##data containing other sample identifiers
complete_vaf_df$sample_names <- str_replace(complete_vaf_df$sample_names, "_DONOR", "-DONOR") ##adjust to fit sample names in other files
cnv_table <- read.table("../Gaudin_thesis_data/data_CNA.txt", header = TRUE) ##table showing called cnvs
####formatting for merging######
colnames(cnv_table) <- str_replace_all(colnames(cnv_table), "[.]", "-")
row.names(cnv_table) <- cnv_table$Hugo_Symbol
cnv_table <- cnv_table[,-c(1)]
cnv_table <- t(cnv_table)
cnv_table <- data.frame(cnv_table) ##convert from atomic vector
##egfr flagged mutaions from cbio
egfr_cnv <- data.frame(Tumor_Sample_Barcode = row.names(cnv_table), egfr_cn = cnv_table$EGFR)
all_data_table <- merge(clin_mut_percent_table, egfr_cnv, all = TRUE)
all_data_table$mean_AF[is.na(all_data_table$mean_AF)] <- 0
all_data_table <- merge(all_data_table, clin_sample_key_table, by.x = "Tumor_Sample_Barcode", by.y = "id_1") ##id_1 = tumor sample barcode, id_2 == sample name
all_data_table <- merge(complete_vaf_df,all_data_table,by.x = "sample_names", by.y = "id_2", all.x = TRUE)
all_data_table$mean_AF[is.na(all_data_table$mean_AF)] <- 0
frag_avg_z_scores$sample_names <- str_replace(frag_avg_z_scores$sample_names, "_DONOR","-DONOR")
all_data_table <- clin_total_table <- merge(all_data_table,frag_avg_z_scores,by = "sample_names")
all_data_table <- all_data_table[-c(which(all_data_table$pool == "tumor" & is.na(all_data_table$Tumor_Sample_Barcode))),] ##remove tumor samples (4) which don't have egfr or barcode data
##add clinical patient IDs
patient_ids <- all_data_table$Tumor_Sample_Barcode
patient_ids <-str_replace_all(patient_ids, "-T.*", "")
all_data_table$patient_ids <- patient_ids
##add egfr gene specific Z scores
egfr_z_scores <- data.frame(sample_names = gene_z_df$sample_name, egfr_z_score = gene_z_df$EGFR, egfr_het_SNP_count = gene_counts_df$EGFR)
all_data_table <- merge(all_data_table, egfr_z_scores, by.x = "sample_names", all.x = TRUE)
#all_data_table
##mark samples with same patient id that show copy number flagged in some samples but not others (-1 means flagged for CNV in other patient samples)
updated_egfr_cn <- c()
for(i in 1:nrow(all_data_table)){
if(is.na(all_data_table$egfr_cn[i])){
updated_egfr_cn <- c(updated_egfr_cn,NA)
}
else{
if(all_data_table$egfr_cn[i] == 2){
#print("hi")
updated_egfr_cn <- c(updated_egfr_cn, 2)
}
else{
if(sum(all_data_table$egfr_cn[which(all_data_table$patient_ids == all_data_table$patient_ids[i])]) > 0){
updated_egfr_cn <- c(updated_egfr_cn, -1)
}
else{
updated_egfr_cn <- c(updated_egfr_cn, 0)
}
}
}
}
all_data_table$updated_egfr_cn <- updated_egfr_cn
ggplot(all_data_table, aes(x = pool, y = VAF_Z_Score)) + geom_boxplot() + xlab("Pool") + ylab("Mean Absolute heterozygous SNP VAF Z-score")
#### Boxplot for non-absolute average VAF Z Scores
ggplot(all_data_table, aes(x = pool, y = VAF_Z_NA)) + geom_boxplot() + xlab("Pool") + ylab("Mean Absolute heterozygous SNP VAF Z-score")
a <- ggplot() + xlab("Average SNP VAF Z-Score") + ylab("Sample Count")
a <- a + geom_histogram(data = all_data_table[which(all_data_table$pool == "tumor"),], aes(x=VAF_Z_Score, fill=pool, color=pool, position="identity"), bins = 50)
## Warning: Ignoring unknown aesthetics: position
a <- a + geom_histogram(data = all_data_table[which(all_data_table$pool == "normal"),], aes(x=VAF_Z_Score, fill=pool, color=pool, position="identity"), bins = 50)
## Warning: Ignoring unknown aesthetics: position
b <- a + xlim(0.5,2)
print(a)
print(b)
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).
print("normal min and max")
## [1] "normal min and max"
print(min(all_data_table[which(all_data_table$pool == "normal"),]$VAF_Z_Score))
## [1] 0.7549912
print(max(all_data_table[which(all_data_table$pool == "normal"),]$VAF_Z_Score))
## [1] 1.051833
print("tumor min and max")
## [1] "tumor min and max"
print(min(all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score))
## [1] 0.6714352
print(max(all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score))
## [1] 7.854321
p <- ggplot()
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=VAF_Z_Score,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=VAF_Z_Score,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean VAF Z-score") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$VAF_Z_Score)), color = "purple", linetype="dashed")
print(p)
g <- p + xlim(0,0.3) + ylim(0.7,2) + geom_vline(aes(xintercept=0.05),color = "dark green", linetype="dashed") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$VAF_Z_Score)), color = "purple", linetype="dashed")
print(g)
## Warning: Removed 70 rows containing missing values (geom_point).
p <- ggplot()
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=q75,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=q75,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean VAF Z-score")
print(p)
g <- p + xlim(0,0.3) + ylim(0.7,2)
print(g)
## Warning: Removed 89 rows containing missing values (geom_point).
p <- ggplot()
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=chrom_single_max,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=chrom_single_max,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean VAF Z-score")
print(p)
g <- p + xlim(0,0.3) + ylim(0.7,2)
print(g)
## Warning: Removed 159 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
cor.test(all_data_table$chrom_single_max, all_data_table$mean_AF)
##
## Pearson's product-moment correlation
##
## data: all_data_table$chrom_single_max and all_data_table$mean_AF
## t = 31.776, df = 517, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7819044 0.8404833
## sample estimates:
## cor
## 0.813244
cor.test(all_data_table$chrom_double_max, all_data_table$mean_AF)
##
## Pearson's product-moment correlation
##
## data: all_data_table$chrom_double_max and all_data_table$mean_AF
## t = 32.516, df = 517, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7891013 0.8459143
## sample estimates:
## cor
## 0.8195115
cor.test(all_data_table$chrom_triple_max, all_data_table$mean_AF)
##
## Pearson's product-moment correlation
##
## data: all_data_table$chrom_triple_max and all_data_table$mean_AF
## t = 32.183, df = 517, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7859007 0.8435005
## sample estimates:
## cor
## 0.8167251
cor.test(all_data_table$VAF_Z_Score, all_data_table$mean_AF)
##
## Pearson's product-moment correlation
##
## data: all_data_table$VAF_Z_Score and all_data_table$mean_AF
## t = 28.277, df = 517, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7430741 0.8109762
## sample estimates:
## cor
## 0.7793024
p <- ggplot()
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=chrom_double_max,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=chrom_double_max,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean VAF Z-score")
print(p)
g <- p + xlim(0,0.3) + ylim(0.7,2)
print(g)
## Warning: Removed 138 rows containing missing values (geom_point).
p <- ggplot()
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=chrom_triple_max,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=chrom_triple_max,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean VAF Z-score") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$chrom_triple_max)), color = "purple", linetype="dashed")
print(p)
g <- p + xlim(0,0.3) + ylim(0.8,2) + geom_vline(aes(xintercept=0.05),color = "dark green", linetype="dashed") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$chrom_triple_max)), color = "purple", linetype="dashed")
print(g)
## Warning: Removed 126 rows containing missing values (geom_point).
print("normal min and max")
## [1] "normal min and max"
print(min(all_data_table[which(all_data_table$pool == "normal"),]$chrom_triple_max))
## [1] 0.9901933
print(max(all_data_table[which(all_data_table$pool == "normal"),]$chrom_triple_max))
## [1] 1.605519
print("tumor min and max")
## [1] "tumor min and max"
print(min(all_data_table[which(all_data_table$pool == "tumor"),]$chrom_triple_max))
## [1] 0.9665942
print(max(all_data_table[which(all_data_table$pool == "tumor"),]$chrom_triple_max))
## [1] 19.39982
density_normal <- data.frame(samples = rep("normal", nrow(all_data_table[which(all_data_table$pool == "normal"),])),VAF_Z_Score = all_data_table[which(all_data_table$pool == "normal"),]$VAF_Z_Score)
density_0 <- data.frame(samples = rep("ctDNA = 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),])),VAF_Z_Score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),]$VAF_Z_Score)
density_all <- data.frame(samples = rep("All tumor", nrow(all_data_table[which(all_data_table$pool == "tumor"),])),VAF_Z_Score = all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score)
density_above_0 <- data.frame(samples = rep("ctDNA > 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),])),VAF_Z_Score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),]$VAF_Z_Score)
density_above_0.05 <- data.frame(samples = rep("ctDNA > 0.05", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),])),VAF_Z_Score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),]$VAF_Z_Score)
density_df <- rbind(density_normal, density_all, density_0, density_above_0, density_above_0.05)
#density_df
p <- ggplot(density_df, aes(x=VAF_Z_Score, color = samples)) + geom_density() + xlim(0.5,2) + scale_color_manual(values=c("tan4", "turquoise3", "mediumorchid2", "lightcoral","yellowgreen"))
p
## Warning: Removed 189 rows containing non-finite values (stat_density).
print("roc_df_0")
## [1] "roc_df_0"
roc_df_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF == 0),], "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 130
## [1] "Number of tumor samples above max normal vaf z"
## [1] 15
## [1] "Sensitivity"
## [1] 0.1153846
## [1] "AUC"
## [1] 0.4495908
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Number of tumor samples above max normal vaf z"
## [1] 150
## [1] "Sensitivity"
## [1] 0.3177966
## [1] "AUC"
## [1] 0.6208979
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Number of tumor samples above max normal vaf z"
## [1] 135
## [1] "Sensitivity"
## [1] 0.3947368
## [1] "AUC"
## [1] 0.6860147
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Number of tumor samples above max normal vaf z"
## [1] 102
## [1] "Sensitivity"
## [1] 0.953271
## [1] "AUC"
## [1] 0.9990058
roc_df_0$samples <- rep("ctDNA = 0", nrow(roc_df_0))
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_0,roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)
density_normal <- data.frame(samples = rep("normal", nrow(all_data_table[which(all_data_table$pool == "normal"),])),chrom_triple_max = all_data_table[which(all_data_table$pool == "normal"),]$chrom_triple_max)
density_0 <- data.frame(samples = rep("ctDNA = 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),])),chrom_triple_max = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),]$chrom_triple_max)
density_all <- data.frame(samples = rep("All tumor", nrow(all_data_table[which(all_data_table$pool == "tumor"),])),chrom_triple_max = all_data_table[which(all_data_table$pool == "tumor"),]$chrom_triple_max)
density_above_0 <- data.frame(samples = rep("ctDNA > 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),])),chrom_triple_max = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),]$chrom_triple_max)
density_above_0.05 <- data.frame(samples = rep("ctDNA > 0.05", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),])),chrom_triple_max = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),]$chrom_triple_max)
density_df <- rbind(density_normal, density_all, density_0, density_above_0, density_above_0.05)
#density_df
p <- ggplot(density_df, aes(x=chrom_triple_max, color = samples)) + geom_density() + xlim(0.7,2) + xlab("Mean VAF Z-score for 3 Chromosomes with highest Z-scores ") + scale_color_manual(values=c("tan4", "turquoise3", "mediumorchid2", "lightcoral","yellowgreen"))
p
## Warning: Removed 347 rows containing non-finite values (stat_density).
print("roc_df_0")
## [1] "roc_df_0"
roc_df_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF == 0),], "top_3")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 130
## [1] "Number of tumor samples above max normal vaf z"
## [1] 13
## [1] "Sensitivity"
## [1] 0.1
## [1] "AUC"
## [1] 0.5646481
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "top_3")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Number of tumor samples above max normal vaf z"
## [1] 159
## [1] "Sensitivity"
## [1] 0.3368644
## [1] "AUC"
## [1] 0.7041111
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "top_3")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Number of tumor samples above max normal vaf z"
## [1] 146
## [1] "Sensitivity"
## [1] 0.4269006
## [1] "AUC"
## [1] 0.7571233
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "top_3")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Number of tumor samples above max normal vaf z"
## [1] 107
## [1] "Sensitivity"
## [1] 1
## [1] "AUC"
## [1] 1
roc_df_0$samples <- rep("ctDNA = 0", nrow(roc_df_0))
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_0,roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)
ggplot(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$egfr_het_SNP_count > 7),], aes(x=mean_AF, y= egfr_z_score, color = as.character(updated_egfr_cn))) + geom_point() + ylab("Mean Z-Score for EGFR Heterozygous SNPs") + xlab("Estimated ctDNA Fraction") + ggtitle("EGFR Mean SNP VAF Z-Score vs Predicted ctDNA Fraction and CNV") + scale_color_discrete(name = "Predicted CNV Amplification", labels = c("Amplified in other samples from same patient", "No amplification", "Amplification"), l = 70, c = 150)
print("number of SNPs")
## [1] "number of SNPs"
print(length(which(all_data_table$egfr_het_SNP_count > 7)))
## [1] 401
print("number of SNPs from patients with no flagged EGFR cnv")
## [1] "number of SNPs from patients with no flagged EGFR cnv"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == 0)))
## [1] 390
print("number of SNPs from samples with flagged EGFR cnv")
## [1] "number of SNPs from samples with flagged EGFR cnv"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == 2)))
## [1] 9
print("number of SNPs from samples with no flagged EGFR cnv but patients with flagged egfr CN in other samples")
## [1] "number of SNPs from samples with no flagged EGFR cnv but patients with flagged egfr CN in other samples"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == -1)))
## [1] 2
a <- ggplot() + xlab("Average SNP VAF Z-Score") + ylab("Sample Count")
a <- a + geom_histogram(data = all_data_table[which(all_data_table$pool == "tumor"),], aes(x=abs_frag_z_score, fill=pool, color=pool, position="identity"))
## Warning: Ignoring unknown aesthetics: position
a <- a + geom_histogram(data = all_data_table[which(all_data_table$pool == "normal"),], aes(x=abs_frag_z_score, fill=pool, color=pool, position="identity"),alpha=0.8) + xlab("Mean Fragment Length Z-Score")
## Warning: Ignoring unknown aesthetics: position
ggplot(all_data_table, aes(x=abs_frag_z_score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) + theme(legend.position="top") + xlab("Mean Fragment Length Z-Score") + ylab("Sample Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(all_data_table, aes(x=abs_frag_z_score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) +
theme(legend.position="top") + xlim(0.1,3) + xlab("Mean Fragment Length Z-Score") + ylab("Sample Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 61 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).
print(a)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
print(b)
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).
print("normal min and max")
## [1] "normal min and max"
print(min(all_data_table[which(all_data_table$pool == "normal"),]$abs_frag_z_score))
## [1] 0.191482
print(max(all_data_table[which(all_data_table$pool == "normal"),]$abs_frag_z_score))
## [1] 2.478853
print("tumor min and max")
## [1] "tumor min and max"
print(min(all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score))
## [1] 0.2660128
print(max(all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score))
## [1] 7.053137
p <- ggplot()
p <- p + geom_point(data = clin_total_table[clin_total_table$pool == "tumor",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + geom_point(data = clin_total_table[clin_total_table$pool == "normal",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean Fragment Length Z-score") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$abs_frag_z_score)), color = "purple", linetype="dashed")
print(p)
g <- p + xlim(0,0.2) + ylim(0.1,3) + geom_vline(aes(xintercept=0.05),color = "dark green", linetype="dashed") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$abs_frag_z_score)), color = "purple", linetype="dashed")
print(g)
## Warning: Removed 71 rows containing missing values (geom_point).
density_normal <- data.frame(samples = rep("normal", nrow(all_data_table[which(all_data_table$pool == "normal"),])),abs_frag_z_score = all_data_table[which(all_data_table$pool == "normal"),]$abs_frag_z_score)
density_0 <- data.frame(samples = rep("ctDNA = 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),])),abs_frag_z_score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),]$abs_frag_z_score)
density_all <- data.frame(samples = rep("All tumor", nrow(all_data_table[which(all_data_table$pool == "tumor"),])),abs_frag_z_score = all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score)
density_above_0 <- data.frame(samples = rep("ctDNA > 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),])),abs_frag_z_score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),]$abs_frag_z_score)
density_above_0.05 <- data.frame(samples = rep("ctDNA > 0.05", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),])),abs_frag_z_score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),]$abs_frag_z_score)
density_df <- rbind(density_normal, density_all, density_0, density_above_0, density_above_0.05)
#density_df
p <- ggplot(density_df, aes(x=abs_frag_z_score, color = samples)) + geom_density() + xlim(0,4) + scale_color_manual(values=c("tan4", "turquoise3", "mediumorchid2", "lightcoral","yellowgreen")) + xlab("Fragment Length Z-score")
p
## Warning: Removed 64 rows containing non-finite values (stat_density).
#### ROC curves for classifying samples as clinical cancer patient samples or normal samples
print("roc_df_0")
## [1] "roc_df_0"
roc_df_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF == 0),], "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 130
## [1] "Number of tumor samples above max frag z"
## [1] 12
## [1] "Sensitivity"
## [1] 0.09230769
## [1] "AUC"
## [1] 0.7607201
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Number of tumor samples above max frag z"
## [1] 91
## [1] "Sensitivity"
## [1] 0.1927966
## [1] "AUC"
## [1] 0.8262261
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Number of tumor samples above max frag z"
## [1] 79
## [1] "Sensitivity"
## [1] 0.2309942
## [1] "AUC"
## [1] 0.851126
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Number of tumor samples above max frag z"
## [1] 47
## [1] "Sensitivity"
## [1] 0.4392523
## [1] "AUC"
## [1] 0.9210579
roc_df_0$samples <- rep("ctDNA = 0", nrow(roc_df_0))
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_0,roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)
print(ggplot(all_data_table[which(all_data_table$pool == "tumor"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = mean_AF)) + geom_point() + ylab("Mean SNP VAF Z-score") + xlab("Mean Fragment Length Z-Score") + labs(color = "Estimated ctDNA Fraction"))
print("Correlation for VAF Z-score and Frag Z Score in all samples")
## [1] "Correlation for VAF Z-score and Frag Z Score in all samples"
print(cor(all_data_table$VAF_Z_Score, all_data_table$abs_frag_z_score))
## [1] 0.5107707
print("Correlation for VAF Z-score and Frag Z Score in tumor samples")
## [1] "Correlation for VAF Z-score and Frag Z Score in tumor samples"
print(cor(all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score, all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score))
## [1] 0.5041213
print("Correlation between VAF Z-score and estimated ctDNA fraction")
## [1] "Correlation between VAF Z-score and estimated ctDNA fraction"
print(cor(all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score, all_data_table[which(all_data_table$pool == "tumor"),]$mean_AF))
## [1] 0.775439
print("Correlation between fragment length Z-score and estimated ctDNA fraction")
## [1] "Correlation between fragment length Z-score and estimated ctDNA fraction"
print(cor(all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score, all_data_table[which(all_data_table$pool == "tumor"),]$mean_AF))
## [1] 0.5082742
print("Correlation for top 3 chrom VAF and meanAF in all samples")
## [1] "Correlation for top 3 chrom VAF and meanAF in all samples"
print(cor(all_data_table$chrom_triple_max, all_data_table$mean_AF))
## [1] 0.8167251
print("Correlation between VAF Z-score and estimated ctDNA fraction")
## [1] "Correlation between VAF Z-score and estimated ctDNA fraction"
print(cor(all_data_table$VAF_Z_Score, all_data_table$mean_AF))
## [1] 0.7793024
print("Correlation between fragment length Z-score and estimated ctDNA fraction")
## [1] "Correlation between fragment length Z-score and estimated ctDNA fraction"
print(cor(all_data_table$abs_frag_z_score, all_data_table$mean_AF))
## [1] 0.5171402
f <- ggplot() + geom_point(data = all_data_table[which(all_data_table$pool == "tumor"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = pool)) + ylab("Mean SNP VAF Z-score") + xlab("Mean Fragment Length Z-Score") + labs(color = "Pool")
f <- f + geom_point(data = all_data_table[which(all_data_table$pool == "normal"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = pool))
print(f)
print("roc_df_0")
## [1] "roc_df_0"
roc_df_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF == 0),], "frag_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 130
## [1] "Number of tumor samples above max frag z or max VAF frag z"
## [1] 24
## [1] "Sensitivity"
## [1] 0.1846154
## [1] "AUC"
## [1] 0.7818331
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "frag_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Number of tumor samples above max frag z or max VAF frag z"
## [1] 199
## [1] "Sensitivity"
## [1] 0.4216102
## [1] "AUC"
## [1] 0.8588172
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "frag_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Number of tumor samples above max frag z or max VAF frag z"
## [1] 175
## [1] "Sensitivity"
## [1] 0.5116959
## [1] "AUC"
## [1] 0.8880801
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "frag_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Number of tumor samples above max frag z or max VAF frag z"
## [1] 107
## [1] "Sensitivity"
## [1] 1
## [1] "AUC"
## [1] 1
roc_df_0$samples <- rep("ctDNA = 0", nrow(roc_df_0))
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_0,roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)
frag_table <- readRDS("../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table$start <- paste("s",frag_table$start,"_",frag_table$end,sep="")
frag_table_starts_ends <- frag_table$start
frag_table <- frag_table[,-c(1,2)]
frag_table <- t(frag_table)
colnames(frag_table) <- frag_table_starts_ends
frag_table <- as.data.frame(frag_table)
frag_table <- frag_table[,-c(ncol(frag_table))]
frag_table$sample_names <- row.names(frag_table)
frag_table <- normalize_frag_table(frag_table)
frag_table$sample_names <- str_replace(frag_table$sample_names, "_DONOR", "-DONOR")
normal_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "normal"),]$sample_names)
tumor_all_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor"),]$sample_names)
tumor_above_0_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),]$sample_names)
tumor_above_05_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),]$sample_names)
normal_frag_table <- frag_table[normal_indices,-c(ncol(frag_table))]
clin_table_all <- frag_table[tumor_all_indices,-c(ncol(frag_table))]
normal_avgs <- colSums(normal_frag_table)/nrow(normal_frag_table)
clin_avgs_all <- colSums(clin_table_all)/nrow(clin_table_all)
all_avgs <- c(normal_avgs,clin_avgs_all)
clin_table_above_0 <- frag_table[tumor_above_0_indices,-c(ncol(frag_table))]
clin_avgs_above_0 <- colSums(clin_table_above_0)/nrow(clin_table_above_0)
clin_table_above_05 <- frag_table[tumor_above_05_indices,-c(ncol(frag_table))]
clin_avgs_above_05 <- colSums(clin_table_above_05)/nrow(clin_table_above_05)
all_avgs_above_0 <- c(normal_avgs,clin_avgs_above_0)
all_avgs_above_05 <- c(normal_avgs,clin_avgs_above_05)
repeated <- length(all_avgs)/2
normal_names <- rep("normal",repeated)
tumor_names <- rep("tumor",repeated)
all_names <- c(normal_names,tumor_names)
intervals <- c(seq(1,595,5),seq(1,595,5))
fragments_df_all <- data.frame(avg_coverage = all_avgs, pool = all_names, bin = intervals)
fragments_df_above_0 <- data.frame(avg_coverage = all_avgs_above_0, pool = all_names, bin = intervals)
fragments_df_above_05 <- data.frame(avg_coverage = all_avgs_above_05, pool = all_names, bin = intervals)
print(ggplot(fragments_df_all, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(0,600))
print(ggplot(fragments_df_above_0, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(0,600))
print(ggplot(fragments_df_above_05, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(0,600))
print(ks.test(normal_avgs, clin_avgs_all))
##
## Two-sample Kolmogorov-Smirnov test
##
## data: normal_avgs and clin_avgs_all
## D = 0.16807, p-value = 0.06937
## alternative hypothesis: two-sided
print(ks.test(normal_avgs, clin_avgs_above_0))
##
## Two-sample Kolmogorov-Smirnov test
##
## data: normal_avgs and clin_avgs_above_0
## D = 0.16807, p-value = 0.06937
## alternative hypothesis: two-sided
print(ks.test(normal_avgs, clin_avgs_above_05))
##
## Two-sample Kolmogorov-Smirnov test
##
## data: normal_avgs and clin_avgs_above_05
## D = 0.14286, p-value = 0.1762
## alternative hypothesis: two-sided
#colnames(normal_frag_table) == colnames(clin_table_all)
p_values <- c()
for(i in 1:ncol(normal_frag_table)){
x <- normal_frag_table[,i]
y <- clin_table_above_05[,i]
v <- ks.test(x, y)
p_values <- c(p_values, v$p.value)
}
#p_values
intervals_p <- seq(1,595,5)
p_cutoff <- 0.05 / length(intervals_p) ##hypothesis correction
abline <- -log( p_cutoff, 10)
fragments_p_values <- data.frame(p_value = p_values, bin = intervals_p)
print(ggplot(fragments_p_values, aes(bin,-log( p_values, 10))) + geom_path() + ylab("-LOG10(p-value)") + xlab("Fragment Length") + geom_hline(yintercept = abline))
#fragments_p_values
#fragments_p_values[which(fragments_p_values$p_value < p_cutoff & normal_avgs > 0.001),] ##abaove .1% for bin covg
fragment_bins_to_analyze <- which(fragments_p_values$p_value < p_cutoff & normal_avgs > 0.001)
fragment_above_001 <- which(normal_avgs > 0.001)
##change p
saveRDS(fragment_bins_to_analyze, "../Gaudin_thesis_data/fragment_bins_to_analyze.rds")
tsne_table_sd_scaled <- frag_table[c(normal_indices,tumor_all_indices),fragment_above_001]
#tsne_table_sd_scaled <- frag_table[,fragment_bins_to_analyze]
for(i in 1:ncol(tsne_table_sd_scaled)){
min <- min(tsne_table_sd_scaled[,i])
max <- max(tsne_table_sd_scaled[,i])
max_m_min <- max - min
tsne_table_sd_scaled[,i] <- (tsne_table_sd_scaled[,i] - min)/max_m_min
}
value_vector <- rep("normal",nrow(frag_table))
value_vector[tumor_all_indices] <- "ctDNA_est_0"
value_vector[tumor_above_0_indices] <- "ctDNA_above_0"
value_vector[tumor_above_05_indices] <- "ctDNA_above_05"
value_vector <- value_vector[c(normal_indices,tumor_all_indices)]
#make colors and labels vectors
color_vector <- value_vector
color_vector[which(color_vector == "normal")] <- "2" #red
color_vector[which(color_vector == "ctDNA_est_0")] <- "3" #green
color_vector[which(color_vector == "ctDNA_above_0")] <- "4" #blue
color_vector[which(color_vector == "ctDNA_above_05")] <- "1" #black
color_vector <- as.numeric(color_vector)
label_vector <- color_vector
label_vector[] <- "*"
#make TSNE
train <- as.matrix(tsne_table_sd_scaled)
tsne <- Rtsne(train, dims = 2, perplexity=30, verbose=FALSE, max_iter = 3000, check_duplicates=FALSE)
plot(tsne$Y, t='n', main="tsne")
text(tsne$Y, labels = label_vector, col = color_vector)
##may need to adjust legend position (x, y are first two parameters)
legend(-20, -5, legend=c("normal", "ctDNA_est_0","ctDNA_above_0", "ctDNA_above_05" ),col=c("red", "green", "blue", "black"), lwd = 1, box.lty=0, box.lwd=0)
###### PCA
pca <- prcomp(t(tsne_table_sd_scaled), scale = FALSE)
pca <- as.data.frame(pca[2]$rotation)
pca$status <- value_vector
print(ggplot(pca) + geom_point(aes(PC1,PC2, fill = status, color = status)))
log_vf_frag_df <- data.frame(pool = all_data_table[which(all_data_table$mean_AF == 0),]$pool, VAF_Z_Score = all_data_table[which(all_data_table$mean_AF == 0),]$VAF_Z_Score, abs_frag_z_score = all_data_table[which(all_data_table$mean_AF == 0),]$abs_frag_z_score, chrom_triple_max = all_data_table[which(all_data_table$mean_AF == 0),]$chrom_triple_max)
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score + chrom_triple_max, data = log_vf_frag_df, family = binomial)
glm.probs <- predict(glm.fit,type = "response")
d <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 0.7915
ci.se(d,specificities = c(1))
## 95% CI (2000 stratified bootstrap replicates):
## sp se.low se.median se.high
## 1 0.1077 0.1769 0.4
#plot(d, asp = NA, xlim=c(1, 0), ylim = c(0,1), col="purple")
log_vf_frag_df <- data.frame(pool = all_data_table$pool, VAF_Z_Score = all_data_table$VAF_Z_Score, abs_frag_z_score = all_data_table$abs_frag_z_score, chrom_triple_max = all_data_table$chrom_triple_max)
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score + chrom_triple_max, data = log_vf_frag_df, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs <- predict(glm.fit,type = "response")
e <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 0.8572
ci.se(e,specificities = c(1))
## 95% CI (2000 stratified bootstrap replicates):
## sp se.low se.median se.high
## 1 0.3432 0.4004 0.5381
log_vf_frag_df <- data.frame(pool = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$pool, VAF_Z_Score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$VAF_Z_Score, abs_frag_z_score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$abs_frag_z_score, chrom_triple_max = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$chrom_triple_max)
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score + chrom_triple_max, data = log_vf_frag_df, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs <- predict(glm.fit,type = "response")
f <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 0.8866
ci.se(f,specificities = c(1))
## 95% CI (2000 stratified bootstrap replicates):
## sp se.low se.median se.high
## 1 0.4269 0.4912 0.6374
log_vf_frag_df <- data.frame(pool = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$pool, VAF_Z_Score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$VAF_Z_Score, abs_frag_z_score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$abs_frag_z_score, chrom_triple_max = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$chrom_triple_max)
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score + chrom_triple_max, data = log_vf_frag_df, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs <- predict(glm.fit,type = "response")
g <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 1
plot(d, asp = NA, xlim=c(1, 0), ylim = c(0,1), col="purple")
lines(g, col="green",lty=1)
lines(e, col="red",lty=1)
lines(f, col="blue",lty=1)
legend(0.4,0.4,legend=c("ctDNA = 0", "all samples","ctDNA above 0","ctDNA above 0.05"), col=c("purple","red","blue","green"), lty = 1)
ci.se(g,specificities = c(1))
## Warning in ci.se.roc(g, specificities = c(1)): ci.se() of a ROC curve with
## AUC == 1 is always a null interval and can be misleading.
## 95% CI (2000 stratified bootstrap replicates):
## sp se.low se.median se.high
## 1 1 1 1